はじめに

  1. dplyrパッケージを用いて,データの抽出や要約,変換を行う.
  2. geom関数の使い分け
  3. 図をカスタマイズする方法 必要なのはtidyデータとその変数を審美的要素へマッピングすること,データの可視化に最適な図を選び出すこと.

5.1 パイプを使ったデータの集計

geom_bar()関数に任せきりにすると,意図しない図が出力される.よって先に表の形に集計・加工しておく.その際にデータの操作・整形を行うためのdplyrパッケージを用いる.

これから,gss_smデータセットから各地域における信仰に関連する行で集計し,図を作る.

glimpse(gss_sm)
## Rows: 2,867
## Columns: 32
## $ year        <dbl> 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 20…
## $ id          <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ ballot      <labelled> 1, 2, 3, 1, 3, 2, 1, 3, 1, 3, 2, 1, 2, 3, 2, 3, 3, …
## $ age         <dbl> 47, 61, 72, 43, 55, 53, 50, 23, 45, 71, 33, 86, 32, 60, …
## $ childs      <dbl> 3, 0, 2, 4, 2, 2, 2, 3, 3, 4, 5, 4, 3, 5, 7, 2, 6, 5, 0,…
## $ sibs        <labelled> 2, 3, 3, 3, 2, 2, 2, 6, 5, 1, 4, 4, 3, 6, 0, 1, 3, …
## $ degree      <fct> Bachelor, High School, Bachelor, High School, Graduate, …
## $ race        <fct> White, White, White, White, White, White, White, Other, …
## $ sex         <fct> Male, Male, Male, Female, Female, Female, Male, Female, …
## $ region      <fct> New England, New England, New England, New England, New …
## $ income16    <fct> $170000 or over, $50000 to 59999, $75000 to $89999, $170…
## $ relig       <fct> None, None, Catholic, Catholic, None, None, None, Cathol…
## $ marital     <fct> Married, Never Married, Married, Married, Married, Marri…
## $ padeg       <fct> Graduate, Lt High School, High School, NA, Bachelor, NA,…
## $ madeg       <fct> High School, High School, Lt High School, High School, H…
## $ partyid     <fct> "Independent", "Ind,near Dem", "Not Str Republican", "No…
## $ polviews    <fct> Moderate, Liberal, Conservative, Moderate, Slightly Libe…
## $ happy       <fct> Pretty Happy, Pretty Happy, Very Happy, Pretty Happy, Ve…
## $ partners    <fct> NA, 1 Partner, 1 Partner, NA, 1 Partner, 1 Partner, NA, …
## $ grass       <fct> NA, Legal, Not Legal, NA, Legal, Legal, NA, Not Legal, N…
## $ zodiac      <fct> Aquarius, Scorpio, Pisces, Cancer, Scorpio, Scorpio, Cap…
## $ pres12      <labelled> 3, 1, 2, 2, 1, 1, NA, NA, NA, 2, NA, NA, 1, 1, 2, 1…
## $ wtssall     <dbl> 0.9569935, 0.4784968, 0.9569935, 1.9139870, 1.4354903, 0…
## $ income_rc   <fct> Gt $170000, Gt $50000, Gt $75000, Gt $170000, Gt $170000…
## $ agegrp      <fct> Age 45-55, Age 55-65, Age 65+, Age 35-45, Age 45-55, Age…
## $ ageq        <fct> Age 34-49, Age 49-62, Age 62+, Age 34-49, Age 49-62, Age…
## $ siblings    <fct> 2, 3, 3, 3, 2, 2, 2, 6+, 5, 1, 4, 4, 3, 6+, 0, 1, 3, 6+,…
## $ kids        <fct> 3, 0, 2, 4+, 2, 2, 2, 3, 3, 4+, 4+, 4+, 3, 4+, 4+, 2, 4+…
## $ religion    <fct> None, None, Catholic, Catholic, None, None, None, Cathol…
## $ bigregion   <fct> Northeast, Northeast, Northeast, Northeast, Northeast, N…
## $ partners_rc <fct> NA, 1, 1, NA, 1, 1, NA, 1, NA, 3, 1, NA, 1, NA, 0, 1, 0,…
## $ obama       <dbl> 0, 1, 0, 0, 1, 1, NA, NA, NA, 0, NA, NA, 1, 1, 0, 1, 0, …
# 目標: bigregionごとにreligionの割合を表にしたい
rel.by.region <- gss_sm %>% # gss_smデータセットについて
                 dplyr::group_by(bigregion, religion) %>% # bigregionでまとめて,さらにreligionでまとめる
                 dplyr::summarise(N = n()) %>% # まとめたところそれぞれについて個数を集計
                 dplyr::mutate(freq = N/sum(N), # 新たにfreqとpct変数を計算し,結合させる.この時,bigregionのグループ分けは残っている
                               pct = round((freq*100), 0))
## `summarise()` regrouping output by 'bigregion' (override with `.groups` argument)
rel.by.region
## # A tibble: 24 x 5
## # Groups:   bigregion [4]
##    bigregion religion       N    freq   pct
##    <fct>     <fct>      <int>   <dbl> <dbl>
##  1 Northeast Protestant   158 0.324      32
##  2 Northeast Catholic     162 0.332      33
##  3 Northeast Jewish        27 0.0553      6
##  4 Northeast None         112 0.230      23
##  5 Northeast Other         28 0.0574      6
##  6 Northeast <NA>           1 0.00205     0
##  7 Midwest   Protestant   325 0.468      47
##  8 Midwest   Catholic     172 0.247      25
##  9 Midwest   Jewish         3 0.00432     0
## 10 Midwest   None         157 0.226      23
## # … with 14 more rows
# 以下のように,summarise(.groups = "drop")すると,全てのグループが解除され,以下のmutateでは全体の中の割合を求めることになる
hoge <- gss_sm %>%
        dplyr::group_by(bigregion, religion) %>%
        dplyr::summarise(N = n(),
                         .groups = "drop") %>% # ここの一文
        dplyr::mutate(freq = N/sum(N),
                      pct = round((freq * 100), 0))
hoge %>% group_by(bigregion) %>% summarise(total = sum(pct))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 4 x 2
##   bigregion total
##   <fct>     <dbl>
## 1 Northeast    18
## 2 Midwest      23
## 3 South        37
## 4 West         21
# 計算の確認.各地域で100%になっているか
rel.by.region %>% group_by(bigregion) %>% summarise(total = sum(pct))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 4 x 2
##   bigregion total
##   <fct>     <dbl>
## 1 Northeast   100
## 2 Midwest     101
## 3 South       100
## 4 West        101
# ggplotによる作図
p <- ggplot(data = rel.by.region, 
            mapping = aes(x = bigregion, y = pct, fill = religion))
p1 <- p + geom_col(position = "dodge") # 棒の間がない
p2 <- p + geom_col(position = "dodge2") # 棒の間がある
p2

p.all <- ggpubr::ggarrange(p1, p2)
p.all

# さらに見やすいグラフを作る
# 具体的には,facetを用いて地域ごとに分ける
p <- ggplot(data = rel.by.region, 
            mapping = aes(x = religion, y = pct, fill = religion))
p + geom_col(position = "dodge2") + 
    coord_flip() + 
    labs(x = NULL, y = "Percent", fill = "Religion") + 
    guides(fill = "none") + 
    facet_grid(~ bigregion)

5.2 グループ化・カテゴリ化された連続変数の取り扱い

organdata データセットを用いる.本データセットは,年代・国の構造を持ったデータセットで,17カ国のOECD諸国における移植のための臓器提供意思に関する情報が10年分以上含まれている.いくつかのデータには欠損値を示す“NA”が記されている.

organdata
## # A tibble: 238 x 21
##    country year       donors   pop pop_dens   gdp gdp_lag health health_lag
##    <chr>   <date>      <dbl> <int>    <dbl> <int>   <int>  <dbl>      <dbl>
##  1 Austra… NA          NA    17065    0.220 16774   16591   1300       1224
##  2 Austra… 1991-01-01  12.1  17284    0.223 17171   16774   1379       1300
##  3 Austra… 1992-01-01  12.4  17495    0.226 17914   17171   1455       1379
##  4 Austra… 1993-01-01  12.5  17667    0.228 18883   17914   1540       1455
##  5 Austra… 1994-01-01  10.2  17855    0.231 19849   18883   1626       1540
##  6 Austra… 1995-01-01  10.2  18072    0.233 21079   19849   1737       1626
##  7 Austra… 1996-01-01  10.6  18311    0.237 21923   21079   1846       1737
##  8 Austra… 1997-01-01  10.3  18518    0.239 22961   21923   1948       1846
##  9 Austra… 1998-01-01  10.5  18711    0.242 24148   22961   2077       1948
## 10 Austra… 1999-01-01   8.67 18926    0.244 25445   24148   2231       2077
## # … with 228 more rows, and 12 more variables: pubhealth <dbl>, roads <dbl>,
## #   cerebvas <int>, assault <int>, external <int>, txp_pop <dbl>, world <chr>,
## #   opt <chr>, consent_law <chr>, consent_practice <chr>, consistent <chr>,
## #   ccode <chr>
colnames(organdata)
##  [1] "country"          "year"             "donors"           "pop"             
##  [5] "pop_dens"         "gdp"              "gdp_lag"          "health"          
##  [9] "health_lag"       "pubhealth"        "roads"            "cerebvas"        
## [13] "assault"          "external"         "txp_pop"          "world"           
## [17] "opt"              "consent_law"      "consent_practice" "consistent"      
## [21] "ccode"
glimpse(organdata)
## Rows: 238
## Columns: 21
## $ country          <chr> "Australia", "Australia", "Australia", "Australia",…
## $ year             <date> NA, 1991-01-01, 1992-01-01, 1993-01-01, 1994-01-01…
## $ donors           <dbl> NA, 12.09, 12.35, 12.51, 10.25, 10.18, 10.59, 10.26…
## $ pop              <int> 17065, 17284, 17495, 17667, 17855, 18072, 18311, 18…
## $ pop_dens         <dbl> 0.2204433, 0.2232723, 0.2259980, 0.2282198, 0.23064…
## $ gdp              <int> 16774, 17171, 17914, 18883, 19849, 21079, 21923, 22…
## $ gdp_lag          <int> 16591, 16774, 17171, 17914, 18883, 19849, 21079, 21…
## $ health           <dbl> 1300, 1379, 1455, 1540, 1626, 1737, 1846, 1948, 207…
## $ health_lag       <dbl> 1224, 1300, 1379, 1455, 1540, 1626, 1737, 1846, 194…
## $ pubhealth        <dbl> 4.8, 5.4, 5.4, 5.4, 5.4, 5.5, 5.6, 5.7, 5.9, 6.1, 6…
## $ roads            <dbl> 136.59537, 122.25179, 112.83224, 110.54508, 107.980…
## $ cerebvas         <int> 682, 647, 630, 611, 631, 592, 576, 525, 516, 493, 4…
## $ assault          <int> 21, 19, 17, 18, 17, 16, 17, 17, 16, 15, 16, 15, 14,…
## $ external         <int> 444, 425, 406, 376, 387, 371, 395, 385, 410, 409, 3…
## $ txp_pop          <dbl> 0.9375916, 0.9257116, 0.9145470, 0.9056433, 0.89610…
## $ world            <chr> "Liberal", "Liberal", "Liberal", "Liberal", "Libera…
## $ opt              <chr> "In", "In", "In", "In", "In", "In", "In", "In", "In…
## $ consent_law      <chr> "Informed", "Informed", "Informed", "Informed", "In…
## $ consent_practice <chr> "Informed", "Informed", "Informed", "Informed", "In…
## $ consistent       <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Y…
## $ ccode            <chr> "Oz", "Oz", "Oz", "Oz", "Oz", "Oz", "Oz", "Oz", "Oz…
organdata %>% select(1:6) %>% slice_sample(n = 10) # ランダムにデータをとってくる
## # A tibble: 10 x 6
##    country        year       donors   pop pop_dens   gdp
##    <chr>          <date>      <dbl> <int>    <dbl> <int>
##  1 Belgium        1997-01-01   22.5 10181   30.8   22936
##  2 Canada         1994-01-01   13.9 29036    0.291 21428
##  3 France         NA           NA   56709   10.3   18162
##  4 Switzerland    1995-01-01   13    7041   17.1   26304
##  5 United Kingdom 1998-01-01   12.3 58440   24.1   23343
##  6 United Kingdom 1995-01-01   14.4 58005   23.9   19998
##  7 Germany        1995-01-01   12.8 81678   22.9   21411
##  8 Denmark        2002-01-01   12.7  5376   12.5   29228
##  9 Netherlands    1999-01-01   10.9 15812   38.1   25438
## 10 Belgium        2001-01-01   22.2 10287   31.1   27113
# 年に対するドナーの数をかく
p <- ggplot(data = organdata, 
            mapping = aes(x = year, y = donors))
# よくわからない
p + geom_point()
## Warning: Removed 34 rows containing missing values (geom_point).

# 国別のドナーの数の時系列変化
p <- ggplot(data = organdata, 
            mapping = aes(x = year, y = donors))
p + geom_line(mapping = aes(group = country)) + 
    facet_wrap(~ country)
## Warning: Removed 34 row(s) containing missing values (geom_path).

# 国別の箱ひげ図
p <- ggplot(data = organdata,
            mapping = aes(x = country, y = donors))
p + geom_boxplot() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # x軸の国名を45ºにした
## Warning: Removed 34 rows containing non-finite values (stat_boxplot).

# 横軸にする
p <- ggplot(data = organdata, 
            mapping = aes(x = country, y = donors))
p + geom_boxplot() + coord_flip()
## Warning: Removed 34 rows containing non-finite values (stat_boxplot).

# アルファベット順から大きい順に並び替える
# reorder関数を用いる.最初の引数のカテゴリを2番目の引数の平均値(設定可能)で並び替える
p <- ggplot(data = organdata, 
            mapping = aes(x = reorder(country, donors, na.rm = TRUE),
                          y = donors))
p + geom_boxplot() + 
    labs(x = NULL) + 
    coord_flip()
## Warning: Removed 34 rows containing non-finite values (stat_boxplot).

# バイオリンプロット
p + geom_violin() + 
    coord_flip()
## Warning: Removed 34 rows containing non-finite values (stat_ydensity).

# fillを使って色分け
p <- ggplot(data = organdata, 
            mapping = aes(reorder(country, donors, na.rm = TRUE),
                          y = donors, fill = world))
p + geom_boxplot() + 
    labs(x = NULL) + 
    coord_flip() + 
    theme(legend.position = "top")
## Warning: Removed 34 rows containing non-finite values (stat_boxplot).

# 箱ひげ図の代わりに全ての点をプロットするのも良い
p <- ggplot(data = organdata,
            mapping = aes(reorder(country, donors, na.rm = TRUE),
                          y = donors, color = world))
p + geom_point(alpha = 0.3) + 
    labs(x = NULL) + 
    coord_flip() + 
    theme(legend.position = "top")
## Warning: Removed 34 rows containing missing values (geom_point).

# 点が重なって見にくい場合はゆらぎを与える
p <- ggplot(data = organdata, 
            mapping = aes(reorder(country, donors, na.rm = TRUE),
                          y = donors, color = world))
p0 <- p + geom_point() + 
      labs(x = NULL, title = "normal") + 
      coord_flip() + 
      theme(legend.position = "top")
p1 <- p + geom_jitter(position = position_jitter(width = 0.15)) + 
      labs(x = NULL, title = "width") + 
      coord_flip() + 
      theme(legend.position = "top")
p2 <- p + geom_jitter(position = position_jitter(width = 0.15, height = 0.15)) + 
      labs(x = NULL, title = "width & height") + 
      coord_flip() + 
      theme(legend.position = "top")
p3 <- p + geom_jitter(position = position_jitter(height = 0.15)) + 
      labs(x = NULL, title = "height") + 
      coord_flip() +
      theme(legend.position = "top")
p.all <- ggpubr::ggarrange(p0, p1, p2, p3)
## Warning: Removed 34 rows containing missing values (geom_point).

## Warning: Removed 34 rows containing missing values (geom_point).

## Warning: Removed 34 rows containing missing values (geom_point).

## Warning: Removed 34 rows containing missing values (geom_point).
p.all

# jitterのheigthで上下方向の点の散らばりを,widthで左右方向の散らばりを調節
# この場合,heightをいじるとy軸(donors)が動くので実際の大きさとズレてしまう

# クリーブランドドットプロットを作成する
# 国ごとの臓器提供率の平均を用いる
# まずdplyrを用いてデータを作成する
organdata$consent_law
##   [1] "Informed" "Informed" "Informed" "Informed" "Informed" "Informed"
##   [7] "Informed" "Informed" "Informed" "Informed" "Informed" "Informed"
##  [13] "Informed" "Informed" "Presumed" "Presumed" "Presumed" "Presumed"
##  [19] "Presumed" "Presumed" "Presumed" "Presumed" "Presumed" "Presumed"
##  [25] "Presumed" "Presumed" "Presumed" "Presumed" "Presumed" "Presumed"
##  [31] "Presumed" "Presumed" "Presumed" "Presumed" "Presumed" "Presumed"
##  [37] "Presumed" "Presumed" "Presumed" "Presumed" "Presumed" "Presumed"
##  [43] "Informed" "Informed" "Informed" "Informed" "Informed" "Informed"
##  [49] "Informed" "Informed" "Informed" "Informed" "Informed" "Informed"
##  [55] "Informed" "Informed" "Informed" "Informed" "Informed" "Informed"
##  [61] "Informed" "Informed" "Informed" "Informed" "Informed" "Informed"
##  [67] "Informed" "Informed" "Informed" "Informed" "Presumed" "Presumed"
##  [73] "Presumed" "Presumed" "Presumed" "Presumed" "Presumed" "Presumed"
##  [79] "Presumed" "Presumed" "Presumed" "Presumed" "Presumed" "Presumed"
##  [85] "Presumed" "Presumed" "Presumed" "Presumed" "Presumed" "Presumed"
##  [91] "Presumed" "Presumed" "Presumed" "Presumed" "Presumed" "Presumed"
##  [97] "Presumed" "Presumed" "Informed" "Informed" "Informed" "Informed"
## [103] "Informed" "Informed" "Informed" "Informed" "Informed" "Informed"
## [109] "Informed" "Informed" "Informed" "Informed" "Presumed" "Presumed"
## [115] "Presumed" "Presumed" "Presumed" "Presumed" "Presumed" "Presumed"
## [121] "Presumed" "Presumed" "Presumed" "Presumed" "Presumed" "Presumed"
## [127] "Informed" "Informed" "Informed" "Informed" "Informed" "Informed"
## [133] "Informed" "Informed" "Informed" "Informed" "Informed" "Informed"
## [139] "Informed" "Informed" "Informed" "Informed" "Informed" "Informed"
## [145] "Informed" "Informed" "Informed" "Informed" "Informed" "Informed"
## [151] "Informed" "Informed" "Informed" "Informed" "Presumed" "Presumed"
## [157] "Presumed" "Presumed" "Presumed" "Presumed" "Presumed" "Presumed"
## [163] "Presumed" "Presumed" "Presumed" "Presumed" "Presumed" "Presumed"
## [169] "Presumed" "Presumed" "Presumed" "Presumed" "Presumed" "Presumed"
## [175] "Presumed" "Presumed" "Presumed" "Presumed" "Presumed" "Presumed"
## [181] "Presumed" "Presumed" "Presumed" "Presumed" "Presumed" "Presumed"
## [187] "Presumed" "Presumed" "Presumed" "Presumed" "Presumed" "Presumed"
## [193] "Presumed" "Presumed" "Presumed" "Presumed" "Presumed" "Presumed"
## [199] "Presumed" "Presumed" "Presumed" "Presumed" "Presumed" "Presumed"
## [205] "Presumed" "Presumed" "Presumed" "Presumed" "Presumed" "Presumed"
## [211] "Informed" "Informed" "Informed" "Informed" "Informed" "Informed"
## [217] "Informed" "Informed" "Informed" "Informed" "Informed" "Informed"
## [223] "Informed" "Informed" "Informed" "Informed" "Informed" "Informed"
## [229] "Informed" "Informed" "Informed" "Informed" "Informed" "Informed"
## [235] "Informed" "Informed" "Informed" "Informed"
colnames(organdata)
##  [1] "country"          "year"             "donors"           "pop"             
##  [5] "pop_dens"         "gdp"              "gdp_lag"          "health"          
##  [9] "health_lag"       "pubhealth"        "roads"            "cerebvas"        
## [13] "assault"          "external"         "txp_pop"          "world"           
## [17] "opt"              "consent_law"      "consent_practice" "consistent"      
## [21] "ccode"
by.country <- organdata %>%
  group_by(consent_law, country) %>%
  summarise(donors.mean = mean(donors, na.rm = TRUE),
            donors.sd = sd(donors, na.rm = TRUE),
            gdp.mean = mean(gdp, na.rm = TRUE),
            health.mean = mean(health, na.rm = TRUE),
            roads.mean = mean(roads, na.rm = TRUE),
            cerebvas.mean = mean(cerebvas, na.rm = TRUE)
            )
## `summarise()` regrouping output by 'consent_law' (override with `.groups` argument)
by.country
## # A tibble: 17 x 8
## # Groups:   consent_law [2]
##    consent_law country donors.mean donors.sd gdp.mean health.mean roads.mean
##    <chr>       <chr>         <dbl>     <dbl>    <dbl>       <dbl>      <dbl>
##  1 Informed    Austra…        10.6     1.14    22179.       1958.      105. 
##  2 Informed    Canada         14.0     0.751   23711.       2272.      109. 
##  3 Informed    Denmark        13.1     1.47    23722.       2054.      102. 
##  4 Informed    Germany        13.0     0.611   22163.       2349.      113. 
##  5 Informed    Ireland        19.8     2.48    20824.       1480.      118. 
##  6 Informed    Nether…        13.7     1.55    23013.       1993.       76.1
##  7 Informed    United…        13.5     0.775   21359.       1561.       67.9
##  8 Informed    United…        20.0     1.33    29212.       3988.      155. 
##  9 Presumed    Austria        23.5     2.42    23876.       1875.      150. 
## 10 Presumed    Belgium        21.9     1.94    22500.       1958.      155. 
## 11 Presumed    Finland        18.4     1.53    21019.       1615.       93.6
## 12 Presumed    France         16.8     1.60    22603.       2160.      156. 
## 13 Presumed    Italy          11.1     4.28    21554.       1757       122. 
## 14 Presumed    Norway         15.4     1.11    26448.       2217.       70.0
## 15 Presumed    Spain          28.1     4.96    16933        1289.      161. 
## 16 Presumed    Sweden         13.1     1.75    22415.       1951.       72.3
## 17 Presumed    Switze…        14.2     1.71    27233        2776.       96.4
## # … with 1 more variable: cerebvas.mean <dbl>
# 年でグループ分けした
by.year <- organdata %>% 
  group_by(consent_law, year) %>%
  summarise(donors.mean = mean(donors, na.rm = TRUE),
            donors.sd = mean(donors, na.rm = TRUE))
## `summarise()` regrouping output by 'consent_law' (override with `.groups` argument)
by.year
## # A tibble: 26 x 4
## # Groups:   consent_law [2]
##    consent_law year       donors.mean donors.sd
##    <chr>       <date>           <dbl>     <dbl>
##  1 Informed    1991-01-01        14.7      14.7
##  2 Informed    1992-01-01        15.2      15.2
##  3 Informed    1993-01-01        15.0      15.0
##  4 Informed    1994-01-01        14.5      14.5
##  5 Informed    1995-01-01        15.6      15.6
##  6 Informed    1996-01-01        14.6      14.6
##  7 Informed    1997-01-01        14.7      14.7
##  8 Informed    1998-01-01        14.8      14.8
##  9 Informed    1999-01-01        14.1      14.1
## 10 Informed    2000-01-01        14.4      14.4
## # … with 16 more rows
# 繰り返しがこれはエレガントではない
# さらに,他の情報(worldなど)が失われる
by.country <- organdata %>%
  group_by(consent_law, country) %>%
  summarize_if(is.numeric, list(mean = mean, sd = sd), na.rm = TRUE) %>%
  ungroup()
by.country
## # A tibble: 17 x 28
##    consent_law country donors_mean pop_mean pop_dens_mean gdp_mean gdp_lag_mean
##    <chr>       <chr>         <dbl>    <dbl>         <dbl>    <dbl>        <dbl>
##  1 Informed    Austra…        10.6   18318.         0.237   22179.       21779.
##  2 Informed    Canada         14.0   29608.         0.297   23711.       23353.
##  3 Informed    Denmark        13.1    5257.        12.2     23722.       23275 
##  4 Informed    Germany        13.0   80255.        22.5     22163.       21938.
##  5 Informed    Ireland        19.8    3674.         5.23    20824.       20154.
##  6 Informed    Nether…        13.7   15548.        37.4     23013.       22554.
##  7 Informed    United…        13.5   58187.        24.0     21359.       20962.
##  8 Informed    United…        20.0  269330.         2.80    29212.       28699.
##  9 Presumed    Austria        23.5    7927.         9.45    23876.       23415.
## 10 Presumed    Belgium        21.9   10153.        30.7     22500.       22096.
## 11 Presumed    Finland        18.4    5112.         1.51    21019.       20763 
## 12 Presumed    France         16.8   58056.        10.5     22603.       22211.
## 13 Presumed    Italy          11.1   57360.        19.0     21554.       21195.
## 14 Presumed    Norway         15.4    4386.         1.35    26448.       25769.
## 15 Presumed    Spain          28.1   39666.         7.84    16933        16584.
## 16 Presumed    Sweden         13.1    8789.         1.95    22415.       22094 
## 17 Presumed    Switze…        14.2    7037.        17.0     27233        26931.
## # … with 21 more variables: health_mean <dbl>, health_lag_mean <dbl>,
## #   pubhealth_mean <dbl>, roads_mean <dbl>, cerebvas_mean <dbl>,
## #   assault_mean <dbl>, external_mean <dbl>, txp_pop_mean <dbl>,
## #   donors_sd <dbl>, pop_sd <dbl>, pop_dens_sd <dbl>, gdp_sd <dbl>,
## #   gdp_lag_sd <dbl>, health_sd <dbl>, health_lag_sd <dbl>, pubhealth_sd <dbl>,
## #   roads_sd <dbl>, cerebvas_sd <dbl>, assault_sd <dbl>, external_sd <dbl>,
## #   txp_pop_sd <dbl>
p <- ggplot(data = by.country, 
            mapping = aes(x = donors_mean, y = reorder(country, donors_mean, na.rm = TRUE),
                          color = consent_law))
p + geom_point(size = 3) + 
    labs(x = NULL) + 
    theme(legend.position = "top")

# 標準偏差を付けたい場合
p1 <- ggplot(data = by.country, 
            mapping = aes(x = donors_mean, 
                          y = reorder(country, donors_mean, na.rm = TRUE),
                          color = consent_law))
p1 <- p1 + geom_pointrange(mapping = aes(xmin = donors_mean - donors_sd, xmax = donors_mean + donors_sd)) + 
      labs(x = "Donor Procurement Rate", y = "")
p2 <- ggplot(data = by.country, 
            mapping = aes(x = reorder(country, donors_mean, na.rm = TRUE),
                          y = donors_mean, 
                          color = consent_law))
p2 <- p2 + geom_pointrange(mapping = aes(ymin = donors_mean - donors_sd, ymax = donors_mean + donors_sd)) + 
          labs(x = "", y = "Donor Procurement Rate") + 
          coord_flip()
p.all <- ggpubr::ggarrange(p1, p2)
p.all

# facetを使ってconsent_lawの違いでグラフを分ける
# facet_wrap(scales = "free_x/y")を設定することでそれぞれ独立にx, yを設定する
p <- ggplot(data = by.country, 
            mapping = aes(x = donors_mean,
                          y = reorder(country, donors_mean, na.rm = TRUE)))
p + geom_point(size = 3) + 
    facet_wrap(~consent_law, 
               scales = "free_y", # デフォルトでは両方の図に全ての国名が表示されるので消す
               ncol = 1)

5.3 図にテキストを直接描画する

p <- ggplot(data = by.country, 
            mapping = aes(x = roads_mean, y = donors_mean))
p + geom_point() + 
    geom_text(mapping = aes(label = country),
              hjust = 0 # ラベルの位置を右側に調整する
              )

# これらは見にくい
# もっと良い方法がある
# ggrepelパッケージを用いる
p <- ggplot(data = by.country,
            mapping = aes(x = roads_mean, y = donors_mean, label = country))
p + geom_point() + 
    ggrepel::geom_text_repel()

p + geom_point() + 
    ggrepel::geom_label_repel()

# elections_historic: 過去のアメリカ大統領選挙に関するデータセット
# socvizパッケージに入っている
elections_historic %>% select(2:8) %>% slice_sample(n = 10)
## # A tibble: 10 x 7
##     year winner              win_party ec_pct popular_pct popular_margin   votes
##    <int> <chr>               <chr>      <dbl>       <dbl>          <dbl>   <int>
##  1  1884 Grover Cleveland    Dem.       0.546       0.488         0.0057  4.91e6
##  2  1824 John Quincy Adams   D.-R.      0.322       0.309        -0.104   1.13e5
##  3  1912 Woodrow Wilson      Dem.       0.819       0.418         0.144   6.30e6
##  4  1840 William Henry Harr… Whig       0.796       0.529         0.0605  1.28e6
##  5  2004 George W. Bush      Rep.       0.532       0.507         0.0246  6.20e7
##  6  1988 George H. W. Bush   Rep.       0.792       0.534         0.0772  4.89e7
##  7  1948 Harry Truman        Dem.       0.571       0.496         0.0448  2.42e7
##  8  1856 James Buchanan      Dem.       0.588       0.453         0.122   1.84e6
##  9  1864 Abraham Lincoln     Rep.       0.910       0.550         0.101   2.21e6
## 10  1980 Ronald Reagan       Rep.       0.909       0.507         0.0974  4.39e7
# -----------------------
# ここから命名方法を.ではなく_を使うことにします
# -----------------------

p_title <- "Presidential Elections: Popular & Electoral College Margins"
p_subtitle <- "1824-2016"
p_caption <- "Data for 2016 are provisional"
x_label <- "Winner's share of Popular Vote"
y_label <- "Winner's share of Electoral College Votes"
p <- ggplot(data = elections_historic,
            mapping = aes(x = popular_pct, y = ec_pct, label = winner_label))
p + geom_hline(yintercept = 0.5, size = 1.4, color = "gray80") + 
    geom_vline(xintercept = 0.5, size = 1.4, color = "gray80") + 
    geom_point() + 
    geom_text_repel() + 
    scale_x_continuous(labels = scales::percent) + 
    scale_y_continuous(labels = scales::percent) + 
    labs(x = x_label, y = y_label, title = p_title, subtitle = p_subtitle, caption = p_caption)
## Warning: ggrepel: 15 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

5.4 特定のデータへのラベリング

organdata %>% slice_sample(n = 10)
## # A tibble: 10 x 21
##    country year       donors   pop pop_dens   gdp gdp_lag health health_lag
##    <chr>   <date>      <dbl> <int>    <dbl> <int>   <int>  <dbl>      <dbl>
##  1 Austra… 1996-01-01   10.6 18311    0.237 21923   21079   1846       1737
##  2 Norway  1991-01-01   15.2  4262    1.32  19134   17905   1542       1385
##  3 Denmark 2002-01-01   12.7  5376   12.5   29228   29203   2580       2520
##  4 Canada  1997-01-01   14.2 29987    0.301 23949   22764   2130       2039
##  5 Denmark 1994-01-01   12.9  5206   12.1   21494   20056   1834       1757
##  6 Norway  NA           NA    4242    1.31  17905   16942   1385       1297
##  7 Austria 1992-01-01   23.1  7841    9.35  20601   19860   1551       1419
##  8 United… NA           NA      NA   NA        NA   27959   2308       2160
##  9 Switze… 1998-01-01   15.4  7110   17.2   28733   27675   2967       2812
## 10 Austria 1995-01-01   21.5  7948    9.48  22817   21940   1865       1739
## # … with 12 more variables: pubhealth <dbl>, roads <dbl>, cerebvas <int>,
## #   assault <int>, external <int>, txp_pop <dbl>, world <chr>, opt <chr>,
## #   consent_law <chr>, consent_practice <chr>, consistent <chr>, ccode <chr>
by_country <- organdata %>%
  group_by(consent_law, country) %>%
  summarize_if(is.numeric, list(mean = mean, sd = sd), na.rm = TRUE) %>%
  ungroup()
by_country
## # A tibble: 17 x 28
##    consent_law country donors_mean pop_mean pop_dens_mean gdp_mean gdp_lag_mean
##    <chr>       <chr>         <dbl>    <dbl>         <dbl>    <dbl>        <dbl>
##  1 Informed    Austra…        10.6   18318.         0.237   22179.       21779.
##  2 Informed    Canada         14.0   29608.         0.297   23711.       23353.
##  3 Informed    Denmark        13.1    5257.        12.2     23722.       23275 
##  4 Informed    Germany        13.0   80255.        22.5     22163.       21938.
##  5 Informed    Ireland        19.8    3674.         5.23    20824.       20154.
##  6 Informed    Nether…        13.7   15548.        37.4     23013.       22554.
##  7 Informed    United…        13.5   58187.        24.0     21359.       20962.
##  8 Informed    United…        20.0  269330.         2.80    29212.       28699.
##  9 Presumed    Austria        23.5    7927.         9.45    23876.       23415.
## 10 Presumed    Belgium        21.9   10153.        30.7     22500.       22096.
## 11 Presumed    Finland        18.4    5112.         1.51    21019.       20763 
## 12 Presumed    France         16.8   58056.        10.5     22603.       22211.
## 13 Presumed    Italy          11.1   57360.        19.0     21554.       21195.
## 14 Presumed    Norway         15.4    4386.         1.35    26448.       25769.
## 15 Presumed    Spain          28.1   39666.         7.84    16933        16584.
## 16 Presumed    Sweden         13.1    8789.         1.95    22415.       22094 
## 17 Presumed    Switze…        14.2    7037.        17.0     27233        26931.
## # … with 21 more variables: health_mean <dbl>, health_lag_mean <dbl>,
## #   pubhealth_mean <dbl>, roads_mean <dbl>, cerebvas_mean <dbl>,
## #   assault_mean <dbl>, external_mean <dbl>, txp_pop_mean <dbl>,
## #   donors_sd <dbl>, pop_sd <dbl>, pop_dens_sd <dbl>, gdp_sd <dbl>,
## #   gdp_lag_sd <dbl>, health_sd <dbl>, health_lag_sd <dbl>, pubhealth_sd <dbl>,
## #   roads_sd <dbl>, cerebvas_sd <dbl>, assault_sd <dbl>, external_sd <dbl>,
## #   txp_pop_sd <dbl>
# gdp_meanが25,000以上のところだけラベル(国名)をつける
p <- ggplot(data = by_country,
            mapping = aes(x = gdp_mean, y = health_mean))
p + geom_point() + 
    geom_text_repel(data = subset(by_country, gdp_mean > 25000),
                    mapping = aes(label = country))

# gdp_meanが25,000以上,health_meanが1,500以下,ベルギーにラベルをつける
p <- ggplot(data = by_country, 
            mapping = aes(x = gdp_mean, y = health_mean))
p + geom_point() + 
    geom_text_repel(data = subset(by_country, 
                                  gdp_mean > 25000 | health_mean < 1500 | country %in% "Belgium"),
                    mapping = aes(label = country))

# ダミー変数を使う方法もある
organdata$ind <- organdata$ccode %in% c("Ita", "Spa") & organdata$year > 1998
organdata$ind
##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE    NA  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [121]  TRUE  TRUE  TRUE  TRUE  TRUE    NA FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169]    NA  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [181]  TRUE    NA FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
p <- ggplot(data = organdata, 
            mapping = aes(x = roads, y = donors, color = ind))
p + geom_point() + 
    geom_text_repel(data = subset(organdata, ind),
                    mapping = aes(label = ccode)) + 
    guides(label = "none", color = "none")
## Warning: Removed 34 rows containing missing values (geom_point).

5.5 図内への描画と書き込み

annotate()関数を用いる

p <- ggplot(data = organdata, 
            mapping = aes(x = roads, y = donors))

# 図に文字を書く
p + geom_point() + 
    annotate(geom = "text", 
             x = 91, y = 33,
             label = "A surprisingly high \n recovery rate.",
             hjust = 0)
## Warning: Removed 34 rows containing missing values (geom_point).

# 図に色を書き込む
p + geom_point() + 
    annotate(geom = "rect", 
             xmin = 125, xmax = 155, 
             ymin = 30, ymax = 35,
             fill = "red", alpha = 0.2) + 
    annotate(geom = "text", 
             x = 157, y = 33,
             label = "A surprisingly high \n recovery rate.",
             hjust = 0)
## Warning: Removed 34 rows containing missing values (geom_point).

5.6 scale_関数・guides()関数・theme()関数

scale関数の命名規則 scale__() 例 - scale_x_continuous: 連続変数xのスケールを調節する - scale_y_discrete: 離散変数yのスケールを調節する - scale_x_log10: 変数xを対数変換する

p <- ggplot(data = organdata, 
            mapping = aes(x = roads, y = donors, color = world))
p1 <- p + geom_point()

# x軸を対数に,y軸のラベルと位置を書き換える
p <- ggplot(data = organdata, 
            mapping = aes(x = roads, y = donors, color = world))
p2 <- p + geom_point() + 
    scale_x_log10() + 
    scale_y_continuous(breaks = c(5, 15, 25), 
                       labels = c("Five", "Fifteen", "Twenty Five"))
ggpubr::ggarrange(p1, p2)
## Warning: Removed 34 rows containing missing values (geom_point).

## Warning: Removed 34 rows containing missing values (geom_point).